Bergen Bysykkel (https://bergenbysykkel.no) is a company that rents out bicycles in Bergen. They also release data through an open API.

Load all needed packages

library(tidyverse)
library(jsonlite)
library(dplyr)
library(data.table)
library(magrittr)
library(assertthat)
library(tidyr)
library(dplyr)
library(purrr)
library(tidymodels)
library(lubridate)
library(sf)          # For spatial data, co-authoured by NHH Prof. Roger Bivand 
library(ggmap)       # For downloading streetmaps as ggplot2-objects.

Problem 1:

Download data for January 2021 in both .json and .csv-format. Use the lines below to load in the data frame from the .csv:

df_202101_csv <- read_csv("data/01.csv") %>% 
  mutate(across(c("start_station_latitude", "start_station_longitude", "end_station_latitude", "end_station_longitude"), function(x) signif(x, digits=15)))
dim(df_202101_csv)

## [1] 27225    13

(a) Read in the .json-file for January 2021. Write a pipeline that transforms it into a data frame with the same columns/format as the .csv-file. Save the data frame as df_202101_json.

df_202101_json <- fromJSON("data/01.json")

#Transfrom json dataframe with same format as csv dataframe

df_202101_json <- df_202101_json %>% 
  mutate(started_at = as.POSIXct(.$started_at,tz="UTC"),
         ended_at = as.POSIXct(.$ended_at,tz="UTC"),
         start_station_id = as.double(.$start_station_id ),
         end_station_id = as.double(.$end_station_id),
         duration = as.double(.$duration) 
  ) %>% 
  mutate(across(where(is.character), str_trim)) %>% 
  mutate(across(c("start_station_latitude", "start_station_longitude", "end_station_latitude", "end_station_longitude"), function(x) signif(x, digits=15)))
  
dim(df_202101_json)

## [1] 27225    13

(b) Write a test that checks that df_202101_json and df_202101_csv should have the same columns, and the same values in all columns.(Hint: Use anti_join() from the dplyr package).

anti_join(df_202101_csv, df_202101_json)

## # A tibble: 0 × 13
## # … with 13 variables: started_at <dttm>, ended_at <dttm>, duration <dbl>,
## #   start_station_id <dbl>, start_station_name <chr>,
## #   start_station_description <chr>, start_station_latitude <dbl>,
## #   start_station_longitude <dbl>, end_station_id <dbl>,
## #   end_station_name <chr>, end_station_description <chr>,
## #   end_station_latitude <dbl>, end_station_longitude <dbl>

Problem 2:

Create a data set with the following properties

• The data set should have one observation for every station, date and hour of day.

• There should be one column n_rides, counting how many rides started at the station at the given day/hour.

• The columns of the final data set should be:

– start_station_id (dbl): station ID.

– floor_start_dh (POSIXct, e.g. ’2021-04-01 13:00:00”): giving the start datetime of the row.

– start_hour (factor): 0-23 numbers indicating which hour an observation belongs to

– weekday_start (factor): numeric representation of weekdays

– n_rides (dbl): count of rides each hour, i.e. a count of all rides from floor_start_dh to floor_start_dh + 1 hour.

• There should be no observations before/after the stations were operational. Use your judgment to assess what the valid date ranges are for each station.

Read in all bike rides from 01 Jan 2021 to 19 Dec 2021

# Get a List of all files in directory named with a key word, say all `.csv` files
filenames <- list.files("data", pattern=".csv", full.names=TRUE)
# read and row bind all data sets
bike_rides_2021_data <- rbindlist(lapply(filenames,fread))

Make a combination of observations for every station, date and hour of day.

start_station_id = as.double(unique(bike_rides_2021_data$start_station_id))
floor_start_dh <- seq(as.POSIXct("2021-01-01 00:00:00"), as.POSIXct("2021-12-19 23:00:00"), by = "hours")
df <- 
  expand.grid(floor_start_dh, start_station_id) %>%
  rename(floor_start_dh = Var1,
         start_station_id = Var2) %>% 
  arrange(start_station_id)
head(df,10)

##         floor_start_dh start_station_id
## 1  2021-01-01 00:00:00                2
## 2  2021-01-01 01:00:00                2
## 3  2021-01-01 02:00:00                2
## 4  2021-01-01 03:00:00                2
## 5  2021-01-01 04:00:00                2
## 6  2021-01-01 05:00:00                2
## 7  2021-01-01 06:00:00                2
## 8  2021-01-01 07:00:00                2
## 9  2021-01-01 08:00:00                2
## 10 2021-01-01 09:00:00                2

## Count of rides for each hour 
bike_rides_2021 <- bike_rides_2021_data %>% 
  mutate(start_date = format(as.POSIXct(.$started_at, '%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%00:%00')) %>% 
  group_by(start_station_id, start_date) %>%
  summarise(n_rides = n())
# Change the class of start_date into date type (instead of character)
bike_rides_2021$start_date <- as.POSIXct(bike_rides_2021$start_date )
bike_rides_2021

## # A tibble: 279,339 × 3
## # Groups:   start_station_id [106]
##    start_station_id start_date          n_rides
##               <int> <dttm>                <int>
##  1                2 2021-01-01 19:00:00       1
##  2                2 2021-01-04 13:00:00       1
##  3                2 2021-01-04 14:00:00       4
##  4                2 2021-01-05 12:00:00       2
##  5                2 2021-01-06 09:00:00       1
##  6                2 2021-01-07 06:00:00       1
##  7                2 2021-01-07 07:00:00       1
##  8                2 2021-01-07 12:00:00       1
##  9                2 2021-01-07 13:00:00       2
## 10                2 2021-01-07 15:00:00       1
## # … with 279,329 more rows

#join the data and fill the NA value (no ride in that hour) as "0"
df_agg <- left_join(df, bike_rides_2021, by = c("start_station_id" = "start_station_id", "floor_start_dh" = "start_date"))
df_agg [is.na(df_agg )] <- 0

# create weekday start as 1-> 7 (Sunday=1 to Saturday=7), start_hour as 0 -> 23
df_agg <- df_agg %>% 
  mutate(start_hour=as.factor(hour(floor_start_dh)),weekday_start=as.factor(wday(floor_start_dh)))
head(df_agg,10)

##         floor_start_dh start_station_id n_rides start_hour weekday_start
## 1  2021-01-01 00:00:00                2       0          0             6
## 2  2021-01-01 01:00:00                2       0          1             6
## 3  2021-01-01 02:00:00                2       0          2             6
## 4  2021-01-01 03:00:00                2       0          3             6
## 5  2021-01-01 04:00:00                2       0          4             6
## 6  2021-01-01 05:00:00                2       0          5             6
## 7  2021-01-01 06:00:00                2       0          6             6
## 8  2021-01-01 07:00:00                2       0          7             6
## 9  2021-01-01 08:00:00                2       0          8             6
## 10 2021-01-01 09:00:00                2       0          9             6

Name the data set df_agg and show that the data set passes 2 tests below:

assert_that(df_agg %>%
              group_by(start_station_id, floor_start_dh) %>%
              summarise(n = n()) %>%
              ungroup() %>%
              summarise(max_n = max(n)) %$%
              max_n == 1,
            msg = "Duplicates on stations/hours/dates")

## [1] TRUE

assert_that(df_agg %>%
              group_by(start_station_id) %>%
              mutate(
                timediff = floor_start_dh - lag(floor_start_dh,
                                                order_by = floor_start_dh)
                ) %>%
              filter(as.numeric(timediff) != 1) %>%
              nrow(.) == 0,
            msg="Time diffs. between obs are not always 1 hour")

## [1] TRUE

Problem 3:

(a) Estimate a linear regression model for each of the stations. Using factors for weekdays and hour of day as explanatory variables and n_rides as the response variable. Determine the exact specification as you see fit.

# Split input data into train set and test set (grouped by station id, train/test ratio is 3/1)
by_station <- df_agg %>% group_by(start_station_id) %>% 
  summarize(split = list(initial_time_split(cur_data()))) %>%
  group_by(start_station_id) %>% 
  mutate(data.train=list(training(split[[1]])),data.test=list(testing(split[[1]]))) %>%
  select(!split)
by_station

## # A tibble: 106 × 3
## # Groups:   start_station_id [106]
##    start_station_id data.train           data.test           
##               <dbl> <list>               <list>              
##  1                2 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  2                3 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  3                5 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  4                7 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  5                8 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  6               22 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  7               24 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  8               33 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
##  9               34 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
## 10               36 <tibble [6,354 × 4]> <tibble [2,118 × 4]>
## # … with 96 more rows

map(setNames(by_station$data.train, by_station$start_station_id)[1:2], summary)

## $`2`
##  floor_start_dh                   n_rides         start_hour   weekday_start
##  Min.   :2021-01-01 00:00:00   Min.   :0.0000   0      : 265   1:911        
##  1st Qu.:2021-03-08 04:15:00   1st Qu.:0.0000   1      : 265   2:912        
##  Median :2021-05-13 09:30:00   Median :0.0000   3      : 265   3:912        
##  Mean   :2021-05-13 09:30:00   Mean   :0.2117   4      : 265   4:907        
##  3rd Qu.:2021-07-18 13:45:00   3rd Qu.:0.0000   5      : 265   5:888        
##  Max.   :2021-09-22 18:00:00   Max.   :9.0000   6      : 265   6:912        
##                                                 (Other):4764   7:912        
## 
## $`3`
##  floor_start_dh                   n_rides         start_hour   weekday_start
##  Min.   :2021-01-01 00:00:00   Min.   : 0.000   0      : 265   1:911        
##  1st Qu.:2021-03-08 04:15:00   1st Qu.: 0.000   1      : 265   2:912        
##  Median :2021-05-13 09:30:00   Median : 1.000   3      : 265   3:912        
##  Mean   :2021-05-13 09:30:00   Mean   : 1.376   4      : 265   4:907        
##  3rd Qu.:2021-07-18 13:45:00   3rd Qu.: 2.000   5      : 265   5:888        
##  Max.   :2021-09-22 18:00:00   Max.   :14.000   6      : 265   6:912        
##                                                 (Other):4764   7:912

map(setNames(by_station$data.test, by_station$start_station_id)[1:2], summary)

## $`2`
##  floor_start_dh                   n_rides          start_hour   weekday_start
##  Min.   :2021-09-22 19:00:00   Min.   :0.00000   2      :  89   1:313        
##  1st Qu.:2021-10-14 20:15:00   1st Qu.:0.00000   19     :  89   2:288        
##  Median :2021-11-05 20:30:00   Median :0.00000   20     :  89   3:288        
##  Mean   :2021-11-05 20:30:00   Mean   :0.04627   21     :  89   4:293        
##  3rd Qu.:2021-11-27 21:45:00   3rd Qu.:0.00000   22     :  89   5:312        
##  Max.   :2021-12-19 23:00:00   Max.   :3.00000   23     :  89   6:312        
##                                                  (Other):1584   7:312        
## 
## $`3`
##  floor_start_dh                   n_rides          start_hour   weekday_start
##  Min.   :2021-09-22 19:00:00   Min.   : 0.0000   2      :  89   1:313        
##  1st Qu.:2021-10-14 20:15:00   1st Qu.: 0.0000   19     :  89   2:288        
##  Median :2021-11-05 20:30:00   Median : 0.0000   20     :  89   3:288        
##  Mean   :2021-11-05 20:30:00   Mean   : 0.9259   21     :  89   4:293        
##  3rd Qu.:2021-11-27 21:45:00   3rd Qu.: 1.0000   22     :  89   5:312        
##  Max.   :2021-12-19 23:00:00   Max.   :12.0000   23     :  89   6:312        
##                                                  (Other):1584   7:312

# A function that fits the linear regression model to each station
station_model <- function(df) {
  lm(n_rides~weekday_start + start_hour, data=df)
}
# put the linear model to each station by using purrr:map() to apply the model to each element
models <- map(by_station$data.train, station_model)
# Put the model inside a mutate
by_station <- by_station %>%
  group_by(start_station_id) %>%
  mutate(model = map(data.train, station_model)) %>% 
  ungroup()
by_station

## # A tibble: 106 × 4
##    start_station_id data.train           data.test            model 
##               <dbl> <list>               <list>               <list>
##  1                2 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  2                3 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  3                5 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  4                7 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  5                8 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  6               22 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  7               24 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  8               33 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
##  9               34 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
## 10               36 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>  
## # … with 96 more rows

# Check models of station 2 and station 3
by_station %>% 
  filter(start_station_id == 2) %>% 
  pluck("model", 1) %>% 
  summary()

## 
## Call:
## lm(formula = n_rides ~ weekday_start + start_hour, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8684 -0.2352 -0.1003 -0.0110  8.3663 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -9.105e-02  4.073e-02  -2.235 0.025422 *  
## weekday_start2  1.021e-01  2.779e-02   3.673 0.000242 ***
## weekday_start3  1.317e-01  2.779e-02   4.739 2.20e-06 ***
## weekday_start4  1.368e-01  2.783e-02   4.915 9.09e-07 ***
## weekday_start5  1.306e-01  2.798e-02   4.669 3.09e-06 ***
## weekday_start6  1.054e-01  2.779e-02   3.792 0.000151 ***
## weekday_start7  3.190e-02  2.779e-02   1.148 0.251060    
## start_hour1    -1.867e-15  5.154e-02   0.000 1.000000    
## start_hour2    -3.449e-04  5.159e-02  -0.007 0.994666    
## start_hour3     3.774e-03  5.154e-02   0.073 0.941634    
## start_hour4     1.887e-02  5.154e-02   0.366 0.714304    
## start_hour5     1.660e-01  5.154e-02   3.222 0.001281 ** 
## start_hour6     1.660e-01  5.154e-02   3.222 0.001281 ** 
## start_hour7     1.547e-01  5.154e-02   3.002 0.002693 ** 
## start_hour8     1.774e-01  5.154e-02   3.441 0.000583 ***
## start_hour9     2.377e-01  5.154e-02   4.613 4.05e-06 ***
## start_hour10    2.679e-01  5.154e-02   5.199 2.07e-07 ***
## start_hour11    3.434e-01  5.154e-02   6.663 2.91e-11 ***
## start_hour12    5.245e-01  5.154e-02  10.178  < 2e-16 ***
## start_hour13    8.226e-01  5.154e-02  15.962  < 2e-16 ***
## start_hour14    6.226e-01  5.154e-02  12.081  < 2e-16 ***
## start_hour15    4.566e-01  5.154e-02   8.860  < 2e-16 ***
## start_hour16    2.943e-01  5.154e-02   5.711 1.17e-08 ***
## start_hour17    2.226e-01  5.154e-02   4.320 1.58e-05 ***
## start_hour18    1.774e-01  5.154e-02   3.441 0.000583 ***
## start_hour19    1.365e-01  5.159e-02   2.647 0.008147 ** 
## start_hour20    2.161e-01  5.159e-02   4.189 2.84e-05 ***
## start_hour21    6.078e-02  5.159e-02   1.178 0.238763    
## start_hour22    7.749e-03  5.159e-02   0.150 0.880602    
## start_hour23    1.732e-04  5.159e-02   0.003 0.997321    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5932 on 6324 degrees of freedom
## Multiple R-squared:  0.1183, Adjusted R-squared:  0.1143 
## F-statistic: 29.26 on 29 and 6324 DF,  p-value: < 2.2e-16

by_station %>% 
  filter(start_station_id == 3) %>% 
  pluck("model", 1) %>% 
  summary()

## 
## Call:
## lm(formula = n_rides ~ weekday_start + start_hour, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6613 -0.9598 -0.1968  0.5184 11.5380 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.195e-01  1.119e-01  -4.643 3.50e-06 ***
## weekday_start2  5.302e-01  7.633e-02   6.946 4.14e-12 ***
## weekday_start3  7.572e-01  7.633e-02   9.919  < 2e-16 ***
## weekday_start4  8.110e-01  7.644e-02  10.610  < 2e-16 ***
## weekday_start5  7.152e-01  7.685e-02   9.306  < 2e-16 ***
## weekday_start6  6.091e-01  7.633e-02   7.980 1.73e-15 ***
## weekday_start7  2.188e-01  7.633e-02   2.866  0.00417 ** 
## start_hour1     6.940e-15  1.416e-01   0.000  1.00000    
## start_hour2    -1.968e-03  1.417e-01  -0.014  0.98892    
## start_hour3     3.019e-02  1.416e-01   0.213  0.83114    
## start_hour4     4.189e-01  1.416e-01   2.959  0.00310 ** 
## start_hour5     9.774e-01  1.416e-01   6.904 5.56e-12 ***
## start_hour6     2.117e+00  1.416e-01  14.954  < 2e-16 ***
## start_hour7     1.668e+00  1.416e-01  11.782  < 2e-16 ***
## start_hour8     1.479e+00  1.416e-01  10.449  < 2e-16 ***
## start_hour9     1.513e+00  1.416e-01  10.689  < 2e-16 ***
## start_hour10    1.517e+00  1.416e-01  10.716  < 2e-16 ***
## start_hour11    1.834e+00  1.416e-01  12.955  < 2e-16 ***
## start_hour12    1.981e+00  1.416e-01  13.994  < 2e-16 ***
## start_hour13    2.608e+00  1.416e-01  18.419  < 2e-16 ***
## start_hour14    3.370e+00  1.416e-01  23.803  < 2e-16 ***
## start_hour15    3.053e+00  1.416e-01  21.564  < 2e-16 ***
## start_hour16    2.675e+00  1.416e-01  18.899  < 2e-16 ***
## start_hour17    2.234e+00  1.416e-01  15.780  < 2e-16 ***
## start_hour18    1.883e+00  1.416e-01  13.301  < 2e-16 ***
## start_hour19    1.372e+00  1.417e-01   9.685  < 2e-16 ***
## start_hour20    1.221e+00  1.417e-01   8.615  < 2e-16 ***
## start_hour21    9.178e-01  1.417e-01   6.477 1.01e-10 ***
## start_hour22    1.375e-01  1.417e-01   0.970  0.33202    
## start_hour23    1.104e-03  1.417e-01   0.008  0.99378    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.63 on 6324 degrees of freedom
## Multiple R-squared:  0.2936, Adjusted R-squared:  0.2903 
## F-statistic: 90.63 on 29 and 6324 DF,  p-value: < 2.2e-16

(b) Predict the number of rides for each hour of day, weekday and station using these regression models. Collect the predictions in a data structure, together with the station ID.

by_station <- by_station %>%
  group_by(start_station_id) %>%
  mutate(predictions = list(predict(model[[1]], newdata=data.test[[1]]))) %>%
  ungroup()
by_station

## # A tibble: 106 × 5
##    start_station_id data.train           data.test            model  predictions
##               <dbl> <list>               <list>               <list> <list>     
##  1                2 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  2                3 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  3                5 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  4                7 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  5                8 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  6               22 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  7               24 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  8               33 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
##  9               34 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
## 10               36 <tibble [6,354 × 4]> <tibble [2,118 × 4]> <lm>   <dbl [2,11…
## # … with 96 more rows

prediction_df_from_station_nr <- function(station_nr) {
  by_station %>%
  filter(start_station_id == station_nr) %>%
  ungroup() %>%
  select(c(data.test, predictions)) %>%
  unnest(c(data.test, predictions)) %>%
  mutate(start_station_id = station_nr)
}

predictions_from_all_stations <- bind_rows(map(by_station$start_station_id, prediction_df_from_station_nr))
predictions_from_all_stations

## # A tibble: 224,508 × 6
##    floor_start_dh      n_rides start_hour weekday_start predictions
##    <dttm>                <dbl> <fct>      <fct>               <dbl>
##  1 2021-09-22 19:00:00       0 19         4                  0.182 
##  2 2021-09-22 20:00:00       0 20         4                  0.262 
##  3 2021-09-22 21:00:00       0 21         4                  0.107 
##  4 2021-09-22 22:00:00       0 22         4                  0.0535
##  5 2021-09-22 23:00:00       0 23         4                  0.0459
##  6 2021-09-23 00:00:00       0 0          5                  0.0396
##  7 2021-09-23 01:00:00       0 1          5                  0.0396
##  8 2021-09-23 02:00:00       0 2          5                  0.0392
##  9 2021-09-23 03:00:00       0 3          5                  0.0433
## 10 2021-09-23 04:00:00       0 4          5                  0.0584
## # … with 224,498 more rows, and 1 more variable: start_station_id <dbl>

(c) Create a set of plots showing the predicted number of bicycle rides throughout an entire week.

Place predicted volume on the y-axis and the hour of day on the x-axis. Make one plot per day of week. Use different colors for each station, and format the plots as appropriate.

predictions_from_all_stations %>%
  filter(floor_start_dh > "2021-10-01 00:00:00", floor_start_dh <= "2021-10-08 00:00:00") %>%
    ggplot(aes(x = as.numeric(start_hour), y = predictions, color = factor(start_station_id))) +
    geom_line() +
    labs(
      x="start_hour",
      y="predictions",
      colour="start_station_id",
      title="Hourly bicycle ride count prediction per station from 2021-10-01 to 2021-10-07",
      ) + 
    facet_wrap(vars(weekday_start), scales="fixed", ncol=1)

Problem 4:

Create a function with the following specifications

• The function should, as a minimum, take as arguments the date and the hour.

• The function should return a plot with the following properties:

– The latitude and the longitude should be mapped to the x and the y-axis respectively.

– The plot should give information about the traffic volume (or predicted traffic volume) for each of the stations in the data set at the given time.

– The plot should be well formatted.

Note that the latitude and longitude of the stations may change over time. Use your best judgement to overcome this problem in order to make a figure that informs the reader of where bicycle traffic is originating from in Bergen at given times.

# The latitude and longitude of the stations may change over time -> calculate the average lon and lat of each station through time
station  <- bike_rides_2021_data %>% select(start_station_id, start_station_longitude, start_station_latitude) %>% 
  group_by(start_station_id) %>% 
  summarise(lon = sum(start_station_longitude)/n(), lat = sum(start_station_latitude)/n())
station

## # A tibble: 106 × 3
##    start_station_id   lon   lat
##               <int> <dbl> <dbl>
##  1                2  5.35  60.4
##  2                3  5.33  60.4
##  3                5  5.34  60.4
##  4                7  5.32  60.4
##  5                8  5.36  60.4
##  6               22  5.32  60.4
##  7               24  5.35  60.4
##  8               33  5.35  60.4
##  9               34  5.33  60.4
## 10               36  5.32  60.4
## # … with 96 more rows

# join data of lon and lat of each station into the df_agg
df_agg_lonlat <- left_join(df_agg, station , by = c("start_station_id" = "start_station_id"))

head(df_agg_lonlat,10)

##         floor_start_dh start_station_id n_rides start_hour weekday_start
## 1  2021-01-01 00:00:00                2       0          0             6
## 2  2021-01-01 01:00:00                2       0          1             6
## 3  2021-01-01 02:00:00                2       0          2             6
## 4  2021-01-01 03:00:00                2       0          3             6
## 5  2021-01-01 04:00:00                2       0          4             6
## 6  2021-01-01 05:00:00                2       0          5             6
## 7  2021-01-01 06:00:00                2       0          6             6
## 8  2021-01-01 07:00:00                2       0          7             6
## 9  2021-01-01 08:00:00                2       0          8             6
## 10 2021-01-01 09:00:00                2       0          9             6
##         lon      lat
## 1  5.351084 60.37525
## 2  5.351084 60.37525
## 3  5.351084 60.37525
## 4  5.351084 60.37525
## 5  5.351084 60.37525
## 6  5.351084 60.37525
## 7  5.351084 60.37525
## 8  5.351084 60.37525
## 9  5.351084 60.37525
## 10 5.351084 60.37525

Create a function

plot_map <- function (date_and_hour) {
  df <- filter(df_agg_lonlat, floor_start_dh == date_and_hour)
  bergen <- make_bbox(df$lon, 
          df$lat, 
          f = .05)
  m <- get_map(bergen, source = "osm")
ggmap(m) + 
  geom_point(aes(x = lon, y = lat, size = n_rides, colour = n_rides), data = df) + 
  scale_colour_gradientn(colours = terrain.colors(9)) +
  geom_text(data = df, aes(label=n_rides), size = 2) +
  labs(
      x="longitude",
      y="latitude",
      colour="number of rides",
      size = "number of rides",
      title = paste0("Bicycle traffic volume in Bergen at ", date_and_hour)
      ) +
  theme(axis.text.x = element_text(angle=-90, hjust=0, vjust=1), 
     plot.title = element_text(size = 12, face = "bold", colour = "black", hjust = 0.5))}

# test the plot with a specific date and hour
plot_map("2021-06-08 13:00:00")

plot_map("2021-10-08 16:00:00")

Home Exam - BAN400

Nguyen Thi Hoa

12/20/2021

Load all needed packages

Problem 1:

Problem 2:

Problem 3:

Problem 4: